/*
 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "radeon_compiler.h"

#include <stdbool.h>
#include <stdio.h>

#include "r300_reg.h"

#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_list.h"
#include "radeon_program.h"
#include "radeon_program_alu.h"
#include "radeon_regalloc.h"
#include "radeon_remove_constants.h"
#include "radeon_swizzle.h"

#include "util/compiler.h"

/*
 * Take an already-setup and valid source then swizzle it appropriately to
 * obtain a constant ZERO or ONE source.
 */
#define __CONST(x, y)                                                                              \
   (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), t_swizzle(y), t_swizzle(y), t_swizzle(y),    \
                    t_swizzle(y), t_src_class(vpi->SrcReg[x].File), RC_MASK_NONE) |                \
    (vpi->SrcReg[x].RelAddr << 4))

static unsigned long
t_dst_mask(unsigned int mask)
{
   /* RC_MASK_* is equivalent to VSF_FLAG_* */
   return mask & RC_MASK_XYZW;
}

static unsigned long
t_dst_class(rc_register_file file)
{
   switch (file) {
   default:
      fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
      FALLTHROUGH;
   case RC_FILE_TEMPORARY:
      return PVS_DST_REG_TEMPORARY;
   case RC_FILE_OUTPUT:
      return PVS_DST_REG_OUT;
   case RC_FILE_ADDRESS:
      return PVS_DST_REG_A0;
   }
}

static unsigned long
t_dst_index(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
{
   if (dst->File == RC_FILE_OUTPUT)
      return vp->outputs[dst->Index];

   return dst->Index;
}

static unsigned long
t_src_class(rc_register_file file)
{
   switch (file) {
   default:
      fprintf(stderr, "%s: Bad register file %i\n", __func__, file);
      FALLTHROUGH;
   case RC_FILE_NONE:
   case RC_FILE_TEMPORARY:
      return PVS_SRC_REG_TEMPORARY;
   case RC_FILE_INPUT:
      return PVS_SRC_REG_INPUT;
   case RC_FILE_CONSTANT:
      return PVS_SRC_REG_CONSTANT;
   }
}

static int
t_src_conflict(struct rc_src_register a, struct rc_src_register b)
{
   unsigned long aclass = t_src_class(a.File);
   unsigned long bclass = t_src_class(b.File);

   if (aclass != bclass)
      return 0;
   if (aclass == PVS_SRC_REG_TEMPORARY)
      return 0;

   if (a.RelAddr || b.RelAddr)
      return 1;
   if (a.Index != b.Index)
      return 1;

   return 0;
}

static inline unsigned long
t_swizzle(unsigned int swizzle)
{
   /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
   return swizzle;
}

static unsigned long
t_src_index(struct r300_vertex_program_code *vp, struct rc_src_register *src)
{
   if (src->File == RC_FILE_INPUT) {
      assert(vp->inputs[src->Index] != -1);
      return vp->inputs[src->Index];
   } else {
      if (src->Index < 0) {
         fprintf(stderr, "negative offsets for indirect addressing do not work.\n");
         return 0;
      }
      return src->Index;
   }
}

/* these two functions should probably be merged... */

static unsigned long
t_src(struct r300_vertex_program_code *vp, struct rc_src_register *src)
{
   /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
    * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
    */
   return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(GET_SWZ(src->Swizzle, 0)),
                          t_swizzle(GET_SWZ(src->Swizzle, 1)), t_swizzle(GET_SWZ(src->Swizzle, 2)),
                          t_swizzle(GET_SWZ(src->Swizzle, 3)), t_src_class(src->File),
                          src->Negate) |
          (src->RelAddr << 4) | (src->Abs << 3);
}

static unsigned long
t_src_scalar(struct r300_vertex_program_code *vp, struct rc_src_register *src)
{
   /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
    * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
    */
   unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);

   return PVS_SRC_OPERAND(t_src_index(vp, src), t_swizzle(swz), t_swizzle(swz), t_swizzle(swz),
                          t_swizzle(swz), t_src_class(src->File),
                          src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
          (src->RelAddr << 4) | (src->Abs << 3);
}

static int
valid_dst(struct r300_vertex_program_code *vp, struct rc_dst_register *dst)
{
   if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
      return 0;
   } else if (dst->File == RC_FILE_ADDRESS) {
      assert(dst->Index == 0);
   }

   return 1;
}

static void
ei_vector1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
           struct rc_sub_instruction *vpi, unsigned int *inst)
{
   inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   inst[1] = t_src(vp, &vpi->SrcReg[0]);
   inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
   inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}

static void
ei_vector2(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
           struct rc_sub_instruction *vpi, unsigned int *inst)
{
   inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 0, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   inst[1] = t_src(vp, &vpi->SrcReg[0]);
   inst[2] = t_src(vp, &vpi->SrcReg[1]);
   inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
}

static void
ei_math1(struct r300_vertex_program_code *vp, unsigned int hw_opcode,
         struct rc_sub_instruction *vpi, unsigned int *inst)
{
   inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 1, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
   inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
   inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
}

static void
ei_math1_select(struct r300_vertex_program_code *vp,
                unsigned math_mode,
                unsigned hw_opcode_ieee,
                unsigned hw_opcode_dx,
                unsigned hw_opcode_ff,
                struct rc_sub_instruction *vpi,
                unsigned int *inst)
{
   unsigned hw_opcode;
   switch (math_mode) {
   case RC_MATH_IEEE: hw_opcode = hw_opcode_ieee; break;
   case RC_MATH_DX: hw_opcode = hw_opcode_dx; break;
   case RC_MATH_FF: hw_opcode = hw_opcode_ff; break;
   default:
      unreachable();
   }
   ei_math1(vp, hw_opcode, vpi, inst);
}

static void
ei_cmp(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
{
   inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE, 0, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);

   /* Arguments with constant swizzles still count as a unique
    * temporary, so we should make sure these arguments share a
    * register index with one of the other arguments. */
   for (unsigned i = 0; i < 3; i++) {
      unsigned j = (i + 1) % 3;
      if (vpi->SrcReg[i].File == RC_FILE_NONE &&
          (vpi->SrcReg[j].File == RC_FILE_NONE || vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
         vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
         break;
      }
   }

   inst[1] = t_src(vp, &vpi->SrcReg[0]);
   inst[2] = t_src(vp, &vpi->SrcReg[2]);
   inst[3] = t_src(vp, &vpi->SrcReg[1]);
}

static void
ei_lit(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
{
   // LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}

   inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 1, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   /* NOTE: Users swizzling might not work. */
   inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                             PVS_SRC_SELECT_FORCE_0,                        // Z
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                             t_src_class(vpi->SrcReg[0].File),
                             vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
             (vpi->SrcReg[0].RelAddr << 4);
   inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                             PVS_SRC_SELECT_FORCE_0,                        // Z
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                             t_src_class(vpi->SrcReg[0].File),
                             vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
             (vpi->SrcReg[0].RelAddr << 4);
   inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]),
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
                             PVS_SRC_SELECT_FORCE_0,                        // Z
                             t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
                             t_src_class(vpi->SrcReg[0].File),
                             vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
             (vpi->SrcReg[0].RelAddr << 4);
}

static void
ei_mad(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
{
   unsigned int i;
   /* Remarks about hardware limitations of MAD
    * (please preserve this comment, as this information is _NOT_
    * in the documentation provided by AMD).
    *
    * As described in the documentation, MAD with three unique temporary
    * source registers requires the use of the macro version.
    *
    * However (and this is not mentioned in the documentation), apparently
    * the macro version is _NOT_ a full superset of the normal version.
    * In particular, the macro version does not always work when relative
    * addressing is used in the source operands.
    *
    * This limitation caused incorrect rendering in Sauerbraten's OpenGL
    * assembly shader path when using medium quality animations
    * (i.e. animations with matrix blending instead of quaternion blending).
    *
    * Unfortunately, I (nha) have been unable to extract a Piglit regression
    * test for this issue - for some reason, it is possible to have vertex
    * programs whose prefix is *exactly* the same as the prefix of the
    * offending program in Sauerbraten up to the offending instruction
    * without causing any trouble.
    *
    * Bottom line: Only use the macro version only when really necessary;
    * according to AMD docs, this should improve performance by one clock
    * as a nice side bonus.
    */
   if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
       vpi->SrcReg[2].File == RC_FILE_TEMPORARY && vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
       vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
       vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
      inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 0, 1, t_dst_index(vp, &vpi->DstReg),
                                   t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                   vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   } else {
      inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 0, 0, t_dst_index(vp, &vpi->DstReg),
                                   t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                   vpi->SaturateMode == RC_SATURATE_ZERO_ONE);

      /* Arguments with constant swizzles still count as a unique
       * temporary, so we should make sure these arguments share a
       * register index with one of the other arguments. */
      for (i = 0; i < 3; i++) {
         unsigned int j;
         if (vpi->SrcReg[i].File != RC_FILE_NONE)
            continue;

         for (j = 0; j < 3; j++) {
            if (i != j) {
               vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
               break;
            }
         }
      }
   }
   inst[1] = t_src(vp, &vpi->SrcReg[0]);
   inst[2] = t_src(vp, &vpi->SrcReg[1]);
   inst[3] = t_src(vp, &vpi->SrcReg[2]);
}

static void
ei_pow(struct r300_vertex_program_code *vp, struct rc_sub_instruction *vpi, unsigned int *inst)
{
   inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 1, 0, t_dst_index(vp, &vpi->DstReg),
                                t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File),
                                vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
   inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
   inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
   inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
}

static void
translate_vertex_program(struct radeon_compiler *c, void *user)
{
   struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
   struct rc_instruction *rci;

   unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
   unsigned loop_depth = 0;
   bool last_input_read_at_loop_end = false;
   bool last_pos_write_at_loop_end = false;

   compiler->code->pos_end = 0; /* Not supported yet */
   compiler->code->length = 0;
   compiler->code->num_temporaries = 0;
   compiler->code->last_input_read = 0;
   compiler->code->last_pos_write = 0;

   compiler->SetHwInputOutput(compiler);

   for (rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions;
        rci = rci->Next) {
      struct rc_sub_instruction *vpi = &rci->U.I;
      unsigned int *inst = compiler->code->body.d + compiler->code->length;
      const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);

      /* Skip instructions writing to non-existing destination */
      if (!valid_dst(compiler->code, &vpi->DstReg))
         continue;

      if (info->HasDstReg) {
         /* Neither is Saturate. */
         if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
            rc_error(&compiler->Base, "Vertex program does not support the Saturate "
                                      "modifier (yet).\n");
         }
      }

      if (compiler->code->length >= c->max_alu_insts * 4) {
         rc_error(&compiler->Base, "Vertex program has too many instructions\n");
         return;
      }

      assert(compiler->Base.is_r500 ||
             (vpi->Opcode != RC_OPCODE_SEQ && vpi->Opcode != RC_OPCODE_SNE));

      switch (vpi->Opcode) {
      case RC_OPCODE_ADD:
         ei_vector2(compiler->code, VE_ADD, vpi, inst);
         break;
      case RC_OPCODE_ARL:
         ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst);
         break;
      case RC_OPCODE_ARR:
         ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst);
         break;
      case RC_OPCODE_COS:
         ei_math1(compiler->code, ME_COS, vpi, inst);
         break;
      case RC_OPCODE_CMP:
         ei_cmp(compiler->code, vpi, inst);
         break;
      case RC_OPCODE_DP4:
         ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst);
         break;
      case RC_OPCODE_DST:
         ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst);
         break;
      case RC_OPCODE_EX2:
         ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst);
         break;
      case RC_OPCODE_EXP:
         ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst);
         break;
      case RC_OPCODE_FRC:
         ei_vector1(compiler->code, VE_FRACTION, vpi, inst);
         break;
      case RC_OPCODE_LG2:
         ei_math1_select(compiler->code, compiler->Base.math_rules, ME_LOG_BASE2_IEEE,
                         ME_LOG_BASE2_FULL_DX, ME_LOG_BASE2_FULL_DX, vpi, inst);
         break;
      case RC_OPCODE_LIT:
         ei_lit(compiler->code, vpi, inst);
         break;
      case RC_OPCODE_LOG:
         ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst);
         break;
      case RC_OPCODE_MAD:
         ei_mad(compiler->code, vpi, inst);
         break;
      case RC_OPCODE_MAX:
         ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst);
         break;
      case RC_OPCODE_MIN:
         ei_vector2(compiler->code, VE_MINIMUM, vpi, inst);
         break;
      case RC_OPCODE_MOV:
         ei_vector1(compiler->code, VE_ADD, vpi, inst);
         break;
      case RC_OPCODE_MUL:
         ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst);
         break;
      case RC_OPCODE_POW:
         ei_pow(compiler->code, vpi, inst);
         break;
      case RC_OPCODE_RCP:
         ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_IEEE,
                         ME_RECIP_DX, ME_RECIP_FF, vpi, inst);
         break;
      case RC_OPCODE_RSQ:
         ei_math1_select(compiler->code, compiler->Base.math_rules, ME_RECIP_SQRT_IEEE,
                         ME_RECIP_SQRT_DX, ME_RECIP_SQRT_FF, vpi, inst);
         break;
      case RC_OPCODE_SEQ:
         ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst);
         break;
      case RC_OPCODE_SGE:
         ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst);
         break;
      case RC_OPCODE_SIN:
         ei_math1(compiler->code, ME_SIN, vpi, inst);
         break;
      case RC_OPCODE_SLT:
         ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst);
         break;
      case RC_OPCODE_SNE:
         ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst);
         break;
      case RC_OPCODE_BGNLOOP: {
         if ((!compiler->Base.is_r500 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) ||
             loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
            rc_error(&compiler->Base, "Loops are nested too deep.");
            return;
         }
         loops[loop_depth++] = ((compiler->code->length) / 4) + 1;
         break;
      }
      case RC_OPCODE_ENDLOOP: {
         unsigned int act_addr;
         unsigned int last_addr;
         unsigned int ret_addr;

         if (loop_depth == 1 && last_input_read_at_loop_end) {
            compiler->code->last_input_read = compiler->code->length / 4;
            last_input_read_at_loop_end = false;
         }
         if (loop_depth == 1 && last_pos_write_at_loop_end) {
            compiler->code->last_pos_write = compiler->code->length / 4;
            last_pos_write_at_loop_end = false;
         }

         ret_addr = loops[--loop_depth];
         act_addr = ret_addr - 1;
         last_addr = (compiler->code->length / 4) - 1;

         if (loop_depth >= R300_VS_MAX_FC_OPS) {
            rc_error(&compiler->Base, "Too many flow control instructions.");
            return;
         }
         /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
          * we reduce it to half to avoid occasional hangs on RV516
          * and downclocked RV530.
          */
         if (compiler->Base.is_r500) {
            compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].lw =
               R500_PVS_FC_ACT_ADRS(act_addr) | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080);
            compiler->code->fc_op_addrs.r500[compiler->code->num_fc_ops].uw =
               R500_PVS_FC_LAST_INST(last_addr) | R500_PVS_FC_RTN_INST(ret_addr);
         } else {
            compiler->code->fc_op_addrs.r300[compiler->code->num_fc_ops] =
               R300_PVS_FC_ACT_ADRS(act_addr) | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) |
               R300_PVS_FC_LAST_INST(last_addr) | R300_PVS_FC_RTN_INST(ret_addr);
         }
         compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
            R300_PVS_FC_LOOP_INIT_VAL(0x0) | R300_PVS_FC_LOOP_STEP_VAL(0x1);
         compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(compiler->code->num_fc_ops);
         compiler->code->num_fc_ops++;

         break;
      }

      case RC_ME_PRED_SET_CLR:
         ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
         break;

      case RC_ME_PRED_SET_INV:
         ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
         break;

      case RC_ME_PRED_SET_POP:
         ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
         break;

      case RC_ME_PRED_SET_RESTORE:
         ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
         break;

      case RC_ME_PRED_SEQ:
         ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
         break;

      case RC_ME_PRED_SNEQ:
         ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
         break;

      case RC_VE_PRED_SNEQ_PUSH:
         ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, vpi, inst);
         break;

      default:
         rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
         return;
      }

      if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
         inst[0] |= (PVS_DST_PRED_ENABLE_MASK << PVS_DST_PRED_ENABLE_SHIFT);
         if (vpi->DstReg.Pred == RC_PRED_SET) {
            inst[0] |= (PVS_DST_PRED_SENSE_MASK << PVS_DST_PRED_SENSE_SHIFT);
         }
      }

      /* Update the number of temporaries. */
      if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
          vpi->DstReg.Index >= compiler->code->num_temporaries)
         compiler->code->num_temporaries = vpi->DstReg.Index + 1;

      /* last instruction that writes position */
      if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
          t_dst_index(compiler->code, &vpi->DstReg) == 0) {
         if (loop_depth == 0)
            compiler->code->last_pos_write = compiler->code->length / 4;
         else
            last_pos_write_at_loop_end = true;
      }

      for (unsigned i = 0; i < info->NumSrcRegs; i++) {
         if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
             vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
            compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
         if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
            if (loop_depth == 0)
               compiler->code->last_input_read = compiler->code->length / 4;
            else
               last_input_read_at_loop_end = true;
         }
      }

      if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
         rc_error(&compiler->Base, "Too many temporaries.\n");
         return;
      }

      compiler->code->length += 4;

      if (compiler->Base.Error)
         return;
   }
}

struct temporary_allocation {
   unsigned int Allocated : 1;
   unsigned int HwTemp : 15;
   struct rc_instruction *LastRead;
};

static int
get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
        unsigned int orig)
{
   if (!ta[orig].Allocated) {
      int j;
      for (j = 0; j < c->max_temp_regs; ++j) {
         if (!hwtemps[j])
            break;
      }
      ta[orig].Allocated = 1;
      ta[orig].HwTemp = j;
      hwtemps[ta[orig].HwTemp] = true;
   }

   return ta[orig].HwTemp;
}

static void
allocate_temporary_registers(struct radeon_compiler *c, void *user)
{
   unsigned int node_count, node_index;
   struct ra_class **node_classes;
   struct rc_list *var_ptr;
   struct rc_list *variables;
   struct ra_graph *graph;
   const struct rc_regalloc_state *ra_state = c->regalloc_state;

   rc_recompute_ips(c);

   /* Get list of program variables */
   variables = rc_get_variables(c);
   node_count = rc_list_count(variables);
   node_classes = memory_pool_malloc(&c->Pool, node_count * sizeof(struct ra_class *));

   for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
      unsigned int class_index = 0;
      int index;
      /* Compute the live intervals */
      rc_variable_compute_live_intervals(var_ptr->Item);
      unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
      index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
      if (index > -1) {
         class_index = c->regalloc_state->class_list[index].ID;
      } else {
         rc_error(c, "Could not find class for index=%u mask=%u\n",
                  ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
      }
      node_classes[node_index] = ra_state->classes[class_index];
   }

   graph = ra_alloc_interference_graph(ra_state->regs, node_count);

   for (node_index = 0; node_index < node_count; node_index++) {
      ra_set_node_class(graph, node_index, node_classes[node_index]);
   }

   rc_build_interference_graph(graph, variables);

   if (!ra_allocate(graph)) {
      rc_error(c, "Ran out of hardware temporaries\n");
      ralloc_free(graph);
      return;
   }

   /* Rewrite the registers */
   for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
      int reg = ra_get_node_reg(graph, node_index);
      unsigned int writemask = reg_get_writemask(reg);
      unsigned int index = reg_get_index(reg);
      struct rc_variable *var = var_ptr->Item;

      rc_variable_change_dst(var, index, writemask);
   }

   ralloc_free(graph);
}

/**
 * Vertex engine cannot read two inputs or two constants at the same time.
 * Introduce intermediate MOVs to temporary registers to account for this.
 */
static int
transform_source_conflicts(struct radeon_compiler *c, struct rc_instruction *inst, void *unused)
{
   const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);

   if (opcode->NumSrcRegs == 3) {
      if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) ||
          t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
         int tmpreg = rc_find_free_temporary(c);
         struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
         inst_mov->U.I.DstReg.Index = tmpreg;
         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
         inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
         inst_mov->U.I.SrcReg[0].Negate = 0;
         inst_mov->U.I.SrcReg[0].Abs = 0;

         inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
         inst->U.I.SrcReg[2].Index = tmpreg;
         inst->U.I.SrcReg[2].RelAddr = false;
      }
   }

   if (opcode->NumSrcRegs >= 2) {
      if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
         int tmpreg = rc_find_free_temporary(c);
         struct rc_instruction *inst_mov = rc_insert_new_instruction(c, inst->Prev);
         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
         inst_mov->U.I.DstReg.Index = tmpreg;
         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
         inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
         inst_mov->U.I.SrcReg[0].Negate = 0;
         inst_mov->U.I.SrcReg[0].Abs = 0;

         inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
         inst->U.I.SrcReg[1].Index = tmpreg;
         inst->U.I.SrcReg[1].RelAddr = false;
      }
   }

   return 1;
}

static void
rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
{
   struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler *)c;
   int i;

   for (i = 0; i < 32; ++i) {
      if ((compiler->RequiredOutputs & (1U << i)) &&
          !(compiler->Base.Program.OutputsWritten & (1U << i))) {
         struct rc_instruction *inst =
            rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
         inst->U.I.Opcode = RC_OPCODE_MOV;

         inst->U.I.DstReg.File = RC_FILE_OUTPUT;
         inst->U.I.DstReg.Index = i;
         inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;

         inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
         inst->U.I.SrcReg[0].Index = 0;
         inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;

         compiler->Base.Program.OutputsWritten |= 1U << i;
      }
   }
}

static int
swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
   (void)opcode;
   (void)reg;

   return 1;
}

const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
   .IsNative = &swizzle_is_native, .Split = NULL /* should never be called */
};

void
r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
{
   int is_r500 = c->Base.is_r500;
   int opt = !c->Base.disable_optimizations;
   bool debug = c->Base.Debug & RC_DBG_LOG;

   /* Lists of instruction transformations. */
   struct radeon_program_transformation alu_rewrite[] = {{&r300_transform_vertex_alu, NULL},
                                                         {NULL, NULL}};

   struct radeon_program_transformation resolve_src_conflicts[] = {
      {&transform_source_conflicts, NULL},
      {NULL, NULL}};

   /* List of compiler passes. */
   struct radeon_compiler_pass vs_list[] = {
      /* clang-format off */
      /* NAME                        DUMP PREDICATE FUNCTION                      PARAM */
      {"add artificial outputs",     0,   1,        rc_vs_add_artificial_outputs, NULL},
      {"native rewrite",             1,   1,        rc_local_transform,           alu_rewrite},
      {"unused channels",            1,   opt,      rc_mark_unused_channels,      NULL},
      {"dataflow optimize",          1,   opt,      rc_optimize,                  NULL},
      {"dead constants",             1,   1,        rc_remove_unused_constants,   &c->code->constants_remap_table},
      /* This pass must be done after optimizations. */
      {"source conflict resolve",    1,   1,        rc_local_transform,           resolve_src_conflicts},
      {"register allocation",        1,   opt,      allocate_temporary_registers, NULL},
      {"lower control flow opcodes", 1,   is_r500,  rc_vert_fc,                   NULL},
      {"final code validation",      0,   1,        rc_validate_final_shader,     NULL},
      {"machine code generation",    0,   1,        translate_vertex_program,     NULL},
      {"dump machine code",          0,   debug,    r300_vertex_program_dump,     NULL},
      {NULL,                         0,   0,        NULL, NULL}};
   /* clang-format on */

   c->Base.type = RC_VERTEX_PROGRAM;
   c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;

   rc_run_compiler(&c->Base, vs_list);

   c->code->InputsRead = c->Base.Program.InputsRead;
   c->code->OutputsWritten = c->Base.Program.OutputsWritten;
   rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
}
